{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://cims.nyu.edu/~sbowman/multinli/multinli_1.0.zip\n",
    "# !unzip -o multinli_1.0.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from unidecode import unidecode\n",
    "\n",
    "def cleaning(string):\n",
    "    \"\"\"Pass `string` through unidecode, squeeze runs of spaces to one, and trim both ends.\"\"\"\n",
    "    transliterated = unidecode(string)\n",
    "    squeezed = re.sub(r'[ ]+', ' ', transliterated)\n",
    "    return squeezed.strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['multinli_1.0/multinli_1.0_dev_mismatched.jsonl',\n",
       " 'multinli_1.0/multinli_1.0_train.jsonl',\n",
       " 'multinli_1.0/multinli_1.0_dev_matched.jsonl']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from glob import glob\n",
    "\n",
    "# Collect every MultiNLI .jsonl file; glob() makes no promise about the order.\n",
    "pattern = 'multinli_1.0/multinli_1.0_*.jsonl'\n",
    "files = glob(pattern)\n",
    "files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# glob() returns matches in arbitrary, filesystem-dependent order, so each split\n",
    "# is looked up by file name instead of by its position in `files`.\n",
    "def _split_path(split):\n",
    "    \"\"\"Return the single path in `files` that ends in `_<split>.jsonl`.\n",
    "\n",
    "    Raises FileNotFoundError when zero or several paths match.\n",
    "    \"\"\"\n",
    "    matches = [f for f in files if f.endswith(f'_{split}.jsonl')]\n",
    "    if len(matches) != 1:\n",
    "        raise FileNotFoundError(f'expected exactly one {split!r} file, found {matches}')\n",
    "    return matches[0]\n",
    "\n",
    "def _read_lines(path):\n",
    "    \"\"\"Read a JSON Lines file and return its non-blank lines as strings.\"\"\"\n",
    "    with open(path, encoding='utf-8') as fopen:\n",
    "        # Split on '\\n' only; blank lines (e.g. after the final newline) are dropped.\n",
    "        return [line for line in fopen.read().split('\\n') if line.strip()]\n",
    "\n",
    "train = _read_lines(_split_path('train'))\n",
    "\n",
    "# The dev set is the mismatched split followed by the matched split.\n",
    "dev = _read_lines(_split_path('dev_mismatched'))\n",
    "dev.extend(_read_lines(_split_path('dev_matched')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels = ['contradiction', 'entailment']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 392703/392703 [00:03<00:00, 115787.44it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "# Build the training pairs as 'sentence1 <> sentence2' strings plus their gold labels.\n",
    "train_X, train_Y = [], []\n",
    "\n",
    "for line in tqdm(train):\n",
    "    # A blank line (e.g. the one left after the file's final newline) holds no record.\n",
    "    if not line.strip():\n",
    "        continue\n",
    "    try:\n",
    "        record = json.loads(line)\n",
    "        gold = record['gold_label']\n",
    "        if gold not in labels:\n",
    "            continue\n",
    "        first, second = record['sentence1'], record['sentence2']\n",
    "        both_present = len(first) > 0 and len(second) > 0\n",
    "    except (ValueError, KeyError, TypeError):\n",
    "        # Malformed JSON (JSONDecodeError is a ValueError) or a record without the\n",
    "        # expected fields is skipped; any other error is allowed to surface.\n",
    "        continue\n",
    "    if both_present:\n",
    "        train_X.append(f\"{first} <> {second}\")\n",
    "        train_Y.append(gold)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 20002/20002 [00:00<00:00, 93673.10it/s]\n"
     ]
    }
   ],
   "source": [
    "# Build the evaluation pairs as 'sentence1 <> sentence2' strings plus their gold labels.\n",
    "test_X, test_Y = [], []\n",
    "\n",
    "for line in tqdm(dev):\n",
    "    # A blank line (e.g. the one left after a file's final newline) holds no record.\n",
    "    if not line.strip():\n",
    "        continue\n",
    "    try:\n",
    "        record = json.loads(line)\n",
    "        gold = record['gold_label']\n",
    "        if gold not in labels:\n",
    "            continue\n",
    "        first, second = record['sentence1'], record['sentence2']\n",
    "        both_present = len(first) > 0 and len(second) > 0\n",
    "    except (ValueError, KeyError, TypeError):\n",
    "        # Malformed JSON (JSONDecodeError is a ValueError) or a record without the\n",
    "        # expected fields is skipped; any other error is allowed to surface.\n",
    "        continue\n",
    "    if both_present:\n",
    "        test_X.append(f\"{first} <> {second}\")\n",
    "        test_Y.append(gold)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import youtokentome as yttm\n",
    "\n",
    "# Tokenizer corpus: one 'sentence1 <> sentence2' line per kept pair, test pairs first.\n",
    "# Written explicitly as UTF-8 so the file does not depend on the platform's default encoding.\n",
    "with open('out.txt', 'w', encoding='utf-8') as fopen:\n",
    "    fopen.write('\\n'.join(test_X + train_X))\n",
    "\n",
    "# Train a BPE model with a 30000-entry vocabulary on that corpus, then load the saved model.\n",
    "yttm.BPE.train(data='out.txt', vocab_size=30000, model='vocab.model')\n",
    "bpe = yttm.BPE(model='vocab.model')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['<PAD>', '<UNK>', '<BOS>', '<EOS>']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the first four vocabulary entries.\n",
    "vocab_head = bpe.vocab()[:4]\n",
    "vocab_head"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['halo<BOS> halo']"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Id 2 is '<BOS>' in the vocabulary listing; check how it decodes when placed between two encoded words.\n",
    "bos_id = 2\n",
    "halo_ids = bpe.encode('halo')\n",
    "bpe.decode(halo_ids + [bos_id] + halo_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 261802/261802 [00:09<00:00, 26791.84it/s]\n"
     ]
    }
   ],
   "source": [
    "# Encode each side of every training pair separately and map its gold label to a class id.\n",
    "left_train, right_train, label_train = [], [], []\n",
    "\n",
    "for idx, pair_text in enumerate(tqdm(train_X)):\n",
    "    first, second = pair_text.split(' <> ')\n",
    "    left_train.append(bpe.encode(first))\n",
    "    right_train.append(bpe.encode(second))\n",
    "    label_train.append(labels.index(train_Y[idx]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 13395/13395 [00:00<00:00, 29595.87it/s]\n"
     ]
    }
   ],
   "source": [
    "# Encode each side of every evaluation pair separately and map its gold label to a class id.\n",
    "left_test, right_test, label_test = [], [], []\n",
    "skipped = 0\n",
    "\n",
    "for idx, pair_text in enumerate(tqdm(test_X)):\n",
    "    try:\n",
    "        first, second = pair_text.split(' <> ')\n",
    "        label_id = labels.index(test_Y[idx])\n",
    "        first_ids = bpe.encode(first)\n",
    "        second_ids = bpe.encode(second)\n",
    "    except Exception:\n",
    "        # Best effort: a pair that cannot be split, labelled or encoded is dropped and counted.\n",
    "        skipped += 1\n",
    "        continue\n",
    "    # Append only after every value was computed, so the three lists always stay aligned.\n",
    "    left_test.append(first_ids)\n",
    "    right_test.append(second_ids)\n",
    "    label_test.append(label_id)\n",
    "\n",
    "if skipped:\n",
    "    print(f'skipped {skipped} test pairs that could not be processed')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the separately encoded sides and the class ids for both splits.\n",
    "contrastive_payload = {\n",
    "    'left_train': left_train,\n",
    "    'right_train': right_train,\n",
    "    'label_train': label_train,\n",
    "    'left_test': left_test,\n",
    "    'right_test': right_test,\n",
    "    'label_test': label_test,\n",
    "}\n",
    "\n",
    "with open('contrastive.json', 'w') as fopen:\n",
    "    json.dump(contrastive_payload, fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 261802/261802 [00:09<00:00, 26215.21it/s]\n"
     ]
    }
   ],
   "source": [
    "# Encode every training pair as one sequence: first ids, then id 2 as separator, then second ids.\n",
    "left_train, label_train = [], []\n",
    "\n",
    "for idx, pair_text in enumerate(tqdm(train_X)):\n",
    "    first, second = pair_text.split(' <> ')\n",
    "    joined_ids = bpe.encode(first) + [2] + bpe.encode(second)\n",
    "    left_train.append(joined_ids)\n",
    "    label_train.append(labels.index(train_Y[idx]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 13395/13395 [00:00<00:00, 13604.82it/s]\n"
     ]
    }
   ],
   "source": [
    "# Encode every evaluation pair as one sequence: first ids, then id 2 as separator, then second ids.\n",
    "left_test, label_test = [], []\n",
    "skipped = 0\n",
    "\n",
    "for idx, pair_text in enumerate(tqdm(test_X)):\n",
    "    try:\n",
    "        first, second = pair_text.split(' <> ')\n",
    "        label_id = labels.index(test_Y[idx])\n",
    "        joined_ids = bpe.encode(first) + [2] + bpe.encode(second)\n",
    "    except Exception:\n",
    "        # Best effort: a pair that cannot be split, labelled or encoded is dropped and counted.\n",
    "        skipped += 1\n",
    "        continue\n",
    "    # Append only after every value was computed, so both lists always stay aligned.\n",
    "    left_test.append(joined_ids)\n",
    "    label_test.append(label_id)\n",
    "\n",
    "if skipped:\n",
    "    print(f'skipped {skipped} test pairs that could not be processed')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the joined-sequence encodings and the class ids for both splits.\n",
    "pair_payload = {\n",
    "    'left_train': left_train,\n",
    "    'label_train': label_train,\n",
    "    'left_test': left_test,\n",
    "    'label_test': label_test,\n",
    "}\n",
    "\n",
    "with open('pair.json', 'w') as fopen:\n",
    "    json.dump(pair_payload, fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the raw 'sentence1 <> sentence2' strings and their gold labels for both splits.\n",
    "text_payload = {\n",
    "    'train_X': train_X,\n",
    "    'train_Y': train_Y,\n",
    "    'test_X': test_X,\n",
    "    'test_Y': test_Y,\n",
    "}\n",
    "\n",
    "with open('text.json', 'w') as fopen:\n",
    "    json.dump(text_payload, fopen)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
